DATASET INFORMATION
Number of attributes: 82 (77 of which are numeric protein expression levels; the rest are identifier and categorical columns)
Classes (8):
c-SC-m: control mice, not stimulated to learn, injected with memantine (10 mice)
t-CS-s: trisomy mice, stimulated to learn, injected with saline (7 mice)
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import plotly.io as pio
pio.renderers.default = 'notebook'
# Load the Mice Protein Expression dataset and clean it for unsupervised learning.
file = r'data/Data_Cortex_Nuclear.xls'
df = pd.read_excel(file)
df.head()
df.shape
# Drop MouseID (first column) and the Genotype/Treatment/Behavior columns
# (4th-, 3rd- and 2nd-from-last), keeping the 77 protein measurements plus
# the 'class' label column.
df = df.drop([df.columns[0], df.columns[-2], df.columns[-3], df.columns[-4]], axis = 1)
df.shape
df.describe()
df.isnull().sum()
df.isnull().sum(axis = 1)
# Keep only rows with at least 70 non-null values. NOTE: passing both `how`
# and `thresh` to dropna raises a TypeError on pandas >= 2.0 (they are
# mutually exclusive, and thresh took precedence anyway), so only thresh
# is passed here.
df = df.dropna(thresh = 70)
df.shape
# Impute remaining missing values with per-column means. numeric_only=True
# is required because the string-valued 'class' column is still present and
# would make df.mean() raise on modern pandas.
df = df.fillna(df.mean(numeric_only = True))
df.isnull().sum(axis = 1)
# Standardize the 77 protein-expression columns (zero mean, unit variance),
# then project the standardized data onto its first two principal components.
scaler = StandardScaler()
data = df.loc[:, 'DYRK1A_N':'CaNA_N']
data_scaled = scaler.fit_transform(data)
data_scaled
pca = PCA(n_components = 2)
data_pca = pca.fit_transform(data_scaled)
print("Original shape: {}".format(str(data_scaled.shape)))
print("Reduced shape: {}".format(str(data_pca.shape)))
# 2-D scatter of the samples in principal-component space.
plt.figure(figsize=(10, 10))
plt.scatter(data_pca[:, 0], data_pca[:, 1])
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
# Fraction of the total variance retained by the two components.
print("Explained Variance Ratio:", pca.explained_variance_ratio_.sum())
# Repeat the PCA projection with three components and visualise the result
# as an interactive plotly 3-D scatter.
pca_3d = PCA(n_components = 3)
data_pca_3d = pca_3d.fit_transform(data_scaled)
print("Original shape: {}".format(str(data_scaled.shape)))
print("Reduced shape: {}".format(str(data_pca_3d.shape)))
# Wrap the three component columns in a DataFrame for plotly.
df_pca = pd.DataFrame({
    'First Principal Component': data_pca_3d[:, 0],
    'Second Principal Component': data_pca_3d[:, 1],
    'Third Principal Component': data_pca_3d[:, 2],
})
fig = px.scatter_3d(df_pca, x='First Principal Component', y='Second Principal Component', z='Third Principal Component')
fig.show()
# Total variance captured by the three components.
print("Explained Variance Ratio:", pca_3d.explained_variance_ratio_.sum())
# Embed the scaled data in 2-D with t-SNE (default n_components=2).
# TSNE has no separate transform method, so fit_transform is used.
tsne = TSNE(random_state = 42)
data_tsne = tsne.fit_transform(data_scaled)
print("Original shape: {}".format(str(data_scaled.shape)))
print("Reduced shape: {}".format(str(data_tsne.shape)))
# Scatter the samples in the learned embedding.
plt.figure(figsize=(10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1])
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
# 3-D t-SNE embedding, visualised interactively with plotly.
# TSNE has no separate transform method, so fit_transform is used.
tsne_3d = TSNE(n_components = 3, random_state = 42)
data_tsne_3d = tsne_3d.fit_transform(data_scaled)
print("Original shape: {}".format(str(data_scaled.shape)))
print("Reduced shape: {}".format(str(data_tsne_3d.shape)))
# Wrap the three embedding axes in a DataFrame for plotly.
df_tsne = pd.DataFrame({
    't-SNE feature 0': data_tsne_3d[:, 0],
    't-SNE feature 1': data_tsne_3d[:, 1],
    't-SNE feature 2': data_tsne_3d[:, 2],
})
fig = px.scatter_3d(df_tsne, x='t-SNE feature 0', y='t-SNE feature 1', z='t-SNE feature 2')
fig.show()
# Fit a full PCA (all components) to see how variance is distributed and
# pick a reduced dimensionality for clustering.
pca = PCA(n_components = None)
pca.fit(data_scaled)
data_new = pca.transform(data_scaled)
# Per-component explained variance ratio.
plt.figure(figsize = (10,10))
sns.barplot(x = np.arange(data_scaled.shape[1]), y = pca.explained_variance_ratio_, color = "c")
plt.xticks([])
plt.title("Explained Variance Ratio of Each Principal Component")
plt.show()
# Cumulative explained variance: how much variance the first k components keep.
plt.figure()
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(np.arange(data_scaled.shape[1]), cumulative_explained_variance, color = "c")
plt.xticks([])
plt.title("Cumulative Explained Variance Ratio of Principal Components")
plt.show()
# First index where cumulative variance exceeds 80%. np.argmax on a boolean
# array returns the position of the first True, replacing the previous
# map/lambda/.index(True) idiom.
# NOTE(review): `res` is a 0-based index — the human-readable component
# count is res + 1; confirm which is intended before reporting.
res = int(np.argmax(cumulative_explained_variance > 0.8))
print("Cumulative variance of 80% reached at component", res)
# Reduce to 9 principal components (enough for >80% cumulative variance per
# the analysis above), then cluster with k-means.
pca_9d = PCA(n_components = 9)
pca_9d.fit(data_scaled)
data_pca_9d = pca_9d.transform(data_scaled)
print("Original shape: {}".format(str(data_scaled.shape)))
print("Reduced shape: {}".format(str(data_pca_9d.shape)))
X = data_pca_9d
# y = df['class'].to_numpy()
# K-means with purely random initial centroids; 8 clusters to match the 8
# known classes. random_state pins the stochastic initialisation so the run
# is reproducible, consistent with the k-means++ run below.
kmeans = KMeans(init = 'random', n_clusters = 8, random_state = 0)
kmeans.fit(X)
y_pred = kmeans.predict(X)
# Visualise the cluster assignments in a 2-D t-SNE embedding of X.
# tsne has no transform method, fit_transform is used instead
tsne = TSNE(random_state = 42)
data_tsne = tsne.fit_transform(X)
plt.figure(figsize = (10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c = y_pred)
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
# K-means again, this time seeded with k-means++ for better initial centroids.
kmeans = KMeans(init = 'k-means++', n_clusters = 8, random_state = 0)
y_pred = kmeans.fit_predict(X)
# Fresh 2-D t-SNE embedding of the 9-D PCA features for plotting.
# t-SNE has no transform method, so fit_transform is used instead.
tsne = TSNE(random_state = 42)
data_tsne = tsne.fit_transform(X)
plt.figure(figsize = (10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c = y_pred)
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
# Agglomerative (hierarchical) clustering with Ward linkage. The `affinity`
# keyword was renamed to `metric` in scikit-learn 1.2 and removed in 1.4;
# Euclidean is the default metric (and the only one Ward supports), so it is
# simply omitted for compatibility across versions — behavior is unchanged.
hierarchical = AgglomerativeClustering(n_clusters = 8, linkage = 'ward')
y_pred = hierarchical.fit_predict(X)
# Plot assignments in the t-SNE embedding computed earlier.
plt.figure(figsize = (10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c = y_pred)
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
# Same clustering with average linkage for comparison (Euclidean default).
hierarchical = AgglomerativeClustering(n_clusters = 8, linkage = 'average')
y_pred = hierarchical.fit_predict(X)
plt.figure(figsize = (10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c = y_pred)
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")
# Density-based clustering; eps and min_samples were hand-tuned for this data.
dbscan = DBSCAN(eps = 3.9, min_samples = 4)
y_pred = dbscan.fit_predict(X)
# Count of distinct labels found (noise points are labelled -1).
len(set(y_pred))
# Plot assignments in the t-SNE embedding computed earlier.
plt.figure(figsize = (10, 10))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c = y_pred)
plt.xlabel("t-SNE feature 0")
plt.ylabel("t-SNE feature 1")